Apply methods discussed in lecture to search for GxE interactions in the HELIX data set
BinaryDosage/GxEScanR
The HELIX study is a collaborative project across six longitudinal population-based birth cohort studies in six European countries (France, Greece, Lithuania, Norway, Spain, and the United Kingdom).
For the lab - subcohort of 1,301 mother-child pairs
Note that the genomic data used in this example is simulated.
Variables of interest:
HELIX.MultiAssayExperiment - R object that stores
multiple data sets measured on the same individuals
(e.g. exposure/outcome data, genomic, methylation, metabolites, etc)
Begin by obtaining variables of interest, combine into a
data.frame
# ---- Load RData containing HELIX MultiAssayExperiment object ---- #
# load.Rdata2(filename, path=getwd())
# assumes following data is in the current working directory
load("./data/HELIX.MultiAssayExperiment.RData")
# ---- specify variable names to extract ---- #
outcome.Name <- "hs_bmi_c_cat"
exposure.Name <- "e3_alcpreg_yn_None"
covariate.Names <- c("h_mbmi_None","e3_sex_None","h_age_None","h_cohort","h_edumc_None","ethn_PC1","ethn_PC2")
snp.Names <- paste("SNP", 1:1000, sep=".")
# putting it all together
variables <- c(covariate.Names, exposure.Name, "h_ethnicity_cauc", snp.Names)
# ---- extract variables from HELIX.MultiAssayExperiment ---- #
# 1) select variables to keep
# 2) `intersectionColumns` selects only individuals with complete data (N = 1,122)
# 3) `wideFormat` returns as a data.frame
# d <- wideFormat(intersectColumns(helix_ma[variables, ,]), colDataCols=outcome.Name)
# saveRDS(d, "d_alcohol.rds")
# assumes following file is in /interaction/data within current working directory
d <- readRDS("./data/d_alcohol.rds")
# ---- Genotype design matrix ---- #
X <- d[,grep("SNP", names(d))]
names(X) <- snp.Names
X <- as.matrix(X)
# ---- Epi data.frame ---- #
dataset <- data.frame(d[,c("primary", "hs_bmi_c_cat", "lifestyles_e3_alcpreg_yn_None",
"covariates_h_mbmi_None", "covariates_e3_sex_None", "covariates_h_age_None",
"covariates_h_cohort", "covariates_h_edumc_None", "proteome.cov_ethn_PC1",
"proteome.cov_ethn_PC2", "proteome.cov_h_ethnicity_cauc")])
# Rename columns for this example
names(dataset) <- c("primary", "hs_bmi_c_cat", "e3_alcpreg_yn_None", "h_mbmi_None", "e3_sex_None", "h_age_None", "h_cohort", "h_edumc_None", "ethn_PC1", "ethn_PC2", "h_ethnicity_cauc")
# Dichotomize outcome variable
dataset$hs_bmi_c_cat <- factor(ifelse(as.numeric(dataset$hs_bmi_c_cat)>=3, 1, 0), label = c("Thin/Normal", "Overweight/Obese"))
# Code variables properly (numeric vs factor)
dataset[, c("h_mbmi_None", "h_age_None", "ethn_PC1", "ethn_PC2")] <- lapply(dataset[, c("h_mbmi_None", "h_age_None", "ethn_PC1", "ethn_PC2")], as.numeric)
dataset[, c("e3_alcpreg_yn_None", "e3_sex_None", "h_cohort")] <- lapply(dataset[, c("e3_alcpreg_yn_None", "e3_sex_None", "h_cohort")], factor)
dataset$h_edumc_None <- factor(dataset$h_edumc_None, label = c("Primary school", "Secondary school", "University degree or higher"))
# Other variables for analysis (for convenience)
N <- nrow(d) # number of individuals in the analysis
P <- ncol(X) # number of SNPs in the matrix X
# various ways of generating descriptive stats, personally using `table1` package
library(table1)
label(dataset$e3_alcpreg_yn_None) <- "Maternal alcohol during pregnancy"
label(dataset$h_age_None) <- "Maternal age"
label(dataset$e3_sex_None) <- "Child sex"
label(dataset$h_mbmi_None) <- "Maternal pre-pregnancy BMI"
label(dataset$h_edumc_None) <- "Maternal education"
# run table1 function
table1(~ e3_alcpreg_yn_None +
h_age_None +
e3_sex_None +
h_edumc_None +
h_mbmi_None | hs_bmi_c_cat, data=dataset)
| Thin/Normal (N=786) |
Overweight/Obese (N=336) |
Overall (N=1122) |
|
|---|---|---|---|
| Maternal alcohol during pregnancy | |||
| 0 | 524 (66.7%) | 250 (74.4%) | 774 (69.0%) |
| 1 | 262 (33.3%) | 86 (25.6%) | 348 (31.0%) |
| Maternal age | |||
| Mean (SD) | 30.6 (4.70) | 31.0 (5.26) | 30.7 (4.87) |
| Median [Min, Max] | 31.0 [16.0, 43.5] | 31.0 [16.0, 43.5] | 31.0 [16.0, 43.5] |
| Child sex | |||
| female | 366 (46.6%) | 157 (46.7%) | 523 (46.6%) |
| male | 420 (53.4%) | 179 (53.3%) | 599 (53.4%) |
| Maternal education | |||
| Primary school | 110 (14.0%) | 50 (14.9%) | 160 (14.3%) |
| Secondary school | 248 (31.6%) | 142 (42.3%) | 390 (34.8%) |
| University degree or higher | 428 (54.5%) | 144 (42.9%) | 572 (51.0%) |
| Maternal pre-pregnancy BMI | |||
| Mean (SD) | 24.4 (4.87) | 26.8 (5.39) | 25.1 (5.15) |
| Median [Min, Max] | 23.4 [16.2, 44.8] | 25.4 [17.2, 45.5] | 24.1 [16.2, 45.5] |
univariate_vars <- c("e3_alcpreg_yn_None", "h_age_None", "e3_sex_None", "h_edumc_None", "h_mbmi_None")
univariate.results <- t(sapply(univariate_vars, FUN=function(p) { # using index p facilitate write
x <- dataset[,p]
reg <- glm(dataset$hs_bmi_c_cat ~ as.numeric(x), family=binomial) # perform logistic regression
s.reg <- summary(reg) # get the summary for the regression
c.reg <- s.reg$coef[2,] # select the coefficients for the exposure
return(c.reg)
}, simplify=T))
univariate.results <- data.frame(univariate_vars,univariate.results)
names(univariate.results) <- c("Variable","Estimate", "SD","Z.Statistic", "P-value")
rownames(univariate.results) <- NULL
univariate.results$`P-value` <- formatC(univariate.results$`P-value`, format = "e", digits = 2)
kable(univariate.results, digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| Variable | Estimate | SD | Z.Statistic | P-value |
|---|---|---|---|---|
| e3_alcpreg_yn_None | -0.37 | 0.15 | -2.56 | 1.05e-02 |
| h_age_None | 0.02 | 0.01 | 1.19 | 2.35e-01 |
| e3_sex_None | -0.01 | 0.13 | -0.05 | 9.60e-01 |
| h_edumc_None | -0.24 | 0.09 | -2.65 | 7.95e-03 |
| h_mbmi_None | 0.09 | 0.01 | 6.83 | 8.75e-12 |
Simulated 1000 SNPs
PC1 vs PC2. Note clustering of NHW vs Others
plot(d$proteome.cov_ethn_PC1, d$proteome.cov_ethn_PC2, pch=16, col=ifelse(d$proteome.cov_h_ethnicity_cauc=="yes", 1, 2),
xlab="Component 1", ylab="Component 2")
legend(x="topleft", legend=c("Caucasian", "Other"), col=c(1,2), pch=16)
Simulated correlated structure to approximate real data structure
cormat <- round(cor(X[,1:(P/5)], use="complete.obs"), 2)
cormat[lower.tri(cormat)]<- NA
melted_cormat <- melt(cormat)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
labs(y= "SNPs", x = "SNPs")+
coord_fixed()
Most commonly used method for testing for interaction - fit traditional logistic regression model with interaction term in the form \[logit\left(P(D=1|G,E)\right) = \alpha + \beta_EE + \beta_GG + \beta_{GxE}G\times E\]
\(H0:\beta_{GxE} = 0\) is equivalent to testing \(\frac{OR_{GxE}}{OR_G*OR_E} = 1\) e.g. departure from multiplicative scale interaction
Example - SNP.1 x e3_alcpreg_yn_None
NOTE - we will adjust all models by
SNP.1 <- X[,1] # getting SNP from genotype design matrix
model <- glm(hs_bmi_c_cat ~ e3_alcpreg_yn_None * SNP.1 + h_age_None + h_mbmi_None + h_cohort + h_edumc_None + e3_sex_None + ethn_PC1 + ethn_PC2, data = dataset, family = 'binomial')
kable(tidy(model), digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -5.15 | 0.68 | -7.56 | 0.00 |
| e3_alcpreg_yn_None1 | -0.24 | 0.20 | -1.22 | 0.22 |
| SNP.1 | 0.02 | 0.14 | 0.15 | 0.88 |
| h_age_None | 0.02 | 0.02 | 1.57 | 0.12 |
| h_mbmi_None | 0.11 | 0.01 | 7.64 | 0.00 |
| h_cohort2 | 1.07 | 0.32 | 3.36 | 0.00 |
| h_cohort3 | 1.39 | 0.30 | 4.69 | 0.00 |
| h_cohort4 | 0.59 | 0.31 | 1.88 | 0.06 |
| h_cohort5 | 0.44 | 0.33 | 1.31 | 0.19 |
| h_cohort6 | 1.31 | 0.29 | 4.58 | 0.00 |
| h_edumc_NoneSecondary school | 0.04 | 0.24 | 0.18 | 0.86 |
| h_edumc_NoneUniversity degree or higher | -0.26 | 0.24 | -1.09 | 0.27 |
| e3_sex_Nonemale | 0.00 | 0.14 | -0.02 | 0.98 |
| ethn_PC1 | 0.00 | 0.01 | -0.18 | 0.86 |
| ethn_PC2 | -0.01 | 0.01 | -0.55 | 0.58 |
| e3_alcpreg_yn_None1:SNP.1 | 0.18 | 0.25 | 0.73 | 0.47 |
The interaction term e3_alcpreg_yn_None1:SNP.1 has
p.value = 0.47. Cannot reject null hypothesis and do NOT conclude there
is statistically significant interaction on multiplicative scale.
Case-only models can be more powerful than traditional logistic regression models, provided the assumption of G-E independence in the population is met.
One can use a logistic regression model with form \[logit(P(E=1|G, D=1)) = \gamma_0 + \gamma_GG\]
caseonly <- which(dataset$hs_bmi_c_cat == "Overweight/Obese")
SNP.1 <- (X[,1][caseonly])
dataset_caseonly <- dataset[caseonly, ]
# model_caseonly <- multinom(SNP.1 ~ e3_alcpreg_yn_None + h_age_None + h_mbmi_None + h_cohort + h_edumc_None + e3_sex_None + ethn_PC1 + ethn_PC2, data = dataset_caseonly)
model_caseonly <- glm(e3_alcpreg_yn_None ~ SNP.1 + h_age_None + h_mbmi_None + h_cohort + h_edumc_None + e3_sex_None + ethn_PC1 + ethn_PC2, family = binomial, data = dataset_caseonly)
kable(tidy(model_caseonly), digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1.29 | 1.36 | 0.95 | 0.34 |
| SNP.1 | 0.05 | 0.22 | 0.23 | 0.82 |
| h_age_None | 0.02 | 0.03 | 0.79 | 0.43 |
| h_mbmi_None | -0.05 | 0.03 | -1.50 | 0.13 |
| h_cohort2 | -1.23 | 0.57 | -2.16 | 0.03 |
| h_cohort3 | -2.28 | 0.56 | -4.08 | 0.00 |
| h_cohort4 | -3.76 | 0.77 | -4.88 | 0.00 |
| h_cohort5 | -1.21 | 0.61 | -2.00 | 0.05 |
| h_cohort6 | -1.80 | 0.56 | -3.23 | 0.00 |
| h_edumc_NoneSecondary school | -0.02 | 0.45 | -0.04 | 0.97 |
| h_edumc_NoneUniversity degree or higher | -0.12 | 0.44 | -0.28 | 0.78 |
| e3_sex_Nonemale | -0.20 | 0.29 | -0.70 | 0.48 |
| ethn_PC1 | -0.03 | 0.02 | -1.08 | 0.28 |
| ethn_PC2 | -0.08 | 0.03 | -2.75 | 0.01 |
If the exposure is rare, we can check for G-E independence by testing their association among controls (model included below for illustration)
Note that high BMI is NOT rare! In real analyses, suggest consulting BMI GWAS study results to check for G-E independence (e.g. GIANT consortium)
controlonly <- which(dataset$hs_bmi_c_cat == "Thin/Normal")
SNP.1 <- X[,1][controlonly]
dataset_controlonly <- dataset[controlonly, ]
model_caseonly <- lm(SNP.1 ~ e3_alcpreg_yn_None + h_age_None + h_mbmi_None + h_cohort + h_edumc_None + e3_sex_None + ethn_PC1 + ethn_PC2, data = dataset_controlonly)
kable(tidy(model_caseonly), digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.59 | 0.20 | 3.00 | 0.00 |
| e3_alcpreg_yn_None1 | 0.03 | 0.05 | 0.70 | 0.48 |
| h_age_None | -0.01 | 0.00 | -1.59 | 0.11 |
| h_mbmi_None | 0.00 | 0.00 | 0.27 | 0.79 |
| h_cohort2 | 0.12 | 0.09 | 1.27 | 0.21 |
| h_cohort3 | 0.16 | 0.09 | 1.84 | 0.07 |
| h_cohort4 | 0.21 | 0.09 | 2.36 | 0.02 |
| h_cohort5 | 0.20 | 0.09 | 2.22 | 0.03 |
| h_cohort6 | 0.03 | 0.08 | 0.38 | 0.70 |
| h_edumc_NoneSecondary school | -0.11 | 0.07 | -1.46 | 0.14 |
| h_edumc_NoneUniversity degree or higher | -0.15 | 0.07 | -2.12 | 0.03 |
| e3_sex_Nonemale | 0.05 | 0.04 | 1.21 | 0.23 |
| ethn_PC1 | 0.01 | 0.00 | 1.55 | 0.12 |
| ethn_PC2 | 0.01 | 0.00 | 2.16 | 0.03 |
BinaryDosage + GxEScanRImputation is a useful tool that enables association testing with markers not directly genotyped, increasing statistical power and facilitating data pooling between studies. Genome-wide interaction scans are computationally expensive when the number of markers exceed several million.
BinaryDosage/GxEScanR is a suite of
R packages that efficiently performs interaction scans of
imputed data, implementing several of the methods discussed in
lecture.
BinaryDosage - converts imputed genomic data into binary
format, facilitating data storage, management, and analysis
GxEScanR - efficiently runs GWAS and GWIS using imputed
genotypes stored in the BinaryDosage format. The phenotypes
and exposures can be continuous or binary trait
BinaryDosage takes imputed genotypes in the following
formats:
VCF/INFO format
(Minimac3/4, Michigan Imputation Server)GEN/Sample format
(Impute2)GxEScanR requires the following files to conduct
GWAS/GWIS:
BinaryDosage fileBelow, I create the required files manually for illustration
purposes. Refer to https://samtools.github.io/hts-specs/VCFv4.2.pdf for
VCF file specifications
VCF file example
INFO file example
# ---- VCF/INFO files ---- #
# generate VCF file using design matrix as input
vcf <- data.frame(t(X)) # transpose
# dummy values for this example only
vcf$chr <- "chr1"
vcf$pos <- seq(1,1000)
vcf$ID <- snp.Names
vcf$FORMAT <- "DS"
vcf[c('REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'Sample')] <- rep('.', nrow(vcf))
vcf <- vcf[, c('chr', 'pos', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')]
colnames(vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcfout <- cbind(vcf, data.frame(t(X)))
# write.table(vcfout, "./data/alcohol.vcf", quote = F, row.names = F, col.names = T, sep = "\t")
# Generate INFO file
info <- vcf[, c('ID', 'REF', 'ALT')]
info[c('AAF', 'MAF', 'AvgCall', 'Rsq')] <- rep(0.9, nrow(vcf))
info$Genotyped = "imputed"
info[c('LooRsq', 'EmpR', 'EmpRsq', 'Dose0', 'Dose1')] = "-"
info <- info[, c('ID', 'REF', 'ALT', 'AAF', 'MAF', 'AvgCall', 'Rsq', 'Genotyped', 'LooRsq', 'EmpR', 'EmpRsq', 'Dose0', 'Dose1')]
colnames(info) <- c('SNP', 'REF(0)', 'ALT(1)', 'ALT_Frq', 'MAF', 'AvgCall', 'Rsq', 'Genotyped', 'LooRsq', 'EmpR', 'EmpRsq', 'Dose0', 'Dose1')
# write.table(info, "./data/alcohol.info", quote = F, row.names = F, col.names = T, sep = "\t")
# ---------------------------------- #
# ---- output BinaryDosage file ---- #
# ---------------------------------- #
library(BinaryDosage)
# BinaryDosage::vcftobd(vcffiles = c("interaction/data/alcohol.vcf", './data/alcohol.info'), gz = FALSE, bdfiles = "./data/alcohol.bdose")
# ------------------------------- #
# ---- output covariate file ---- #
# ------------------------------- #
# create covariate file with indicator variables for factors
# make sure it fits requirements specified in GxEScanR documentation
covariates <- c("hs_bmi_c_cat", "h_mbmi_None",
"e3_sex_None", "h_age_None", "h_cohort", "h_edumc_None", "ethn_PC1",
"ethn_PC2", "e3_alcpreg_yn_None")
# create indicator variables
dataset_gxescan <- data.frame(model.matrix(as.formula(paste("~-1+", paste(covariates, collapse = "+"))), data = dataset))
dataset_gxescan <- dataset_gxescan[ , -which(colnames(dataset_gxescan) %in% "hs_bmi_c_catThin.Normal")]
ID <- paste0("X", seq(1, 1122, 1))
dataset_gxescan <- cbind(ID, dataset_gxescan)
# write.table(dataset_gxescan, "./data/epidata_alcohol.txt", row.names = F, quote = F, sep = '\t')
library(GxEScanR)
# Run GxEScanR
# following code block from John by email on 4/5/22
# remove.packages("BinaryDosage")
# install.packages("BinaryDosage")
library(BinaryDosage)
starttime <- Sys.time()
x <- getbdinfo("./data/alcohol.bdose")
endtime <- Sys.time()
endtime - starttime
## Time difference of 0.004806757 secs
bdinfo <- BinaryDosage::getbdinfo("./data/alcohol.bdose")
epidata <- read.table("./data/epidata_alcohol.txt", stringsAsFactors = F, header = T)
start = Sys.time()
output <- GxEScanR::gweis(data = epidata, bdinfo = bdinfo)
## [1] "1122 subjects have complete data"
stop = Sys.time() - start; stop
## Time difference of 5.224196 mins
# create EDGE statistic for two-step method
output$lrtedge = output$lrtdg + output$lrteg
A quick look at GxEScanR output:
kable(head(output)) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | betadg | lrtdg | betagxe | lrtgxe | lrt2df | betaeg | lrteg | lrt3df | betacase | lrtcase | betactrl | lrtctrl | lrtedge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SNP.1 | 0.0749726 | 0.4211911 | 0.1818170 | 0.5240537 | 0.9452448 | 0.0658514 | 0.3158712 | 1.2611160 | 0.0516503 | 0.0532057 | 0.1005141 | 0.5081546 | 0.7370623 |
| SNP.2 | 0.0365660 | 0.0724968 | 0.4301339 | 2.1101264 | 2.1826232 | 0.0868622 | 0.3930484 | 2.5756716 | 0.2260055 | 0.7162641 | 0.0402127 | 0.0595261 | 0.4655452 |
| SNP.3 | 0.0370628 | 0.1256276 | 0.3730735 | 2.5363256 | 2.6619532 | 0.0590307 | 0.3019609 | 2.9639140 | 0.2247721 | 1.1678512 | 0.0027905 | 0.0004770 | 0.4275884 |
| SNP.4 | 0.0808729 | 0.6983601 | 0.2653143 | 1.5366766 | 2.2350366 | 0.0188641 | 0.0365518 | 2.2715884 | 0.0397589 | 0.0418872 | 0.0174630 | 0.0226128 | 0.7349118 |
| SNP.5 | 0.0686623 | 0.4090673 | 0.4611555 | 3.6694038 | 4.0784711 | 0.0276943 | 0.0655006 | 4.1439717 | 0.1733491 | 0.6252844 | -0.0188130 | 0.0221728 | 0.4745679 |
| SNP.6 | -0.0616904 | 0.1218617 | 0.2003390 | 0.2828116 | 0.4046733 | -0.0472336 | 0.0679839 | 0.4726572 | -0.0362884 | 0.0104481 | -0.0509180 | 0.0566850 | 0.1898456 |
GxEScanR output contains the following:
betadg, lrtdg - GWAS estimate + LRT
statisticbetagxe, lrtgxe - GxE estimate + LRT
statistic (traditional logistic regression)lrt2df - 2DF LRT statisticbetaeg, lrteg - G-E association estimate +
LRT statistic in combined case+control samplelrt3df - 3DF LRT statisticbetacase, lrtcase - G-E association
estimate + LRT statistic in cases only (case-only analysis)betactrl, lrtctrl - G-E association
estimate + LRT statistic in controls only
We can look at the Q-Q / Manhattan plots of genotype main effects:
pvalues <- pchisq(output[, 'lrtdg'], df = 1, lower.tail = F)
r <- gcontrol2(pvalues, pch=16)
lambda <- round(r$lambda,3)
text(x=1, y=5, labels=bquote(lambda == .(lambda)), cex=2)
neglog.pvalues <- -log10(pchisq(output[,"lrtdg"], lower.tail = F, df = 1))
plot(1:nrow(output), neglog.pvalues,
pch=16, xaxt="n", ylim=c(0, max(neglog.pvalues, 3)),
ylab="-log(p-value)", xlab="SNPs")
abline(h=-log10(0.05/nrow(output)), lty=2, lwd=2, col=2)
GxE interaction Q-Q / Manhattan plots
pvalues <- pchisq(output[, 'lrtgxe'], df = 1, lower.tail = F)
r <- gcontrol2(pvalues, pch=16)
lambda <- round(r$lambda,3)
text(x=1, y=2.5, labels=bquote(lambda == .(lambda)), cex=2)
neglog.pvalues <- -log10(pchisq(output[,"lrtgxe"], lower.tail = F, df = 1))
plot(1:nrow(output), neglog.pvalues,
pch=16, xaxt="n", ylim=c(0, max(neglog.pvalues, 3)),
ylab="-log(p-value)", xlab="SNPs")
abline(h=-log10(0.05/nrow(output)), lty=2, lwd=2, col=2)
2DF test (Kraft et al 2007)
\[logit(P(D=1|G,E)) = \alpha + \beta_EE + \beta_GG + \beta_{GxE}G \times E\] where we test the hypothesis \(\beta_G = \beta_{GxE} = 0\)
pvalues <- pchisq(output[, 'lrt2df'], df = 2, lower.tail = F)
r <- gcontrol2(pvalues, pch=16)
lambda <- round(r$lambda,3)
text(x=1, y=5, labels=bquote(lambda == .(lambda)), cex=2)
neglog.pvalues <- -log10(pchisq(output[,"lrt2df"], lower.tail = F, df = 2))
plot(1:nrow(output), neglog.pvalues,
pch=16, xaxt="n", ylim=c(0, max(neglog.pvalues, 3)),
ylab="-log(p-value)", xlab="SNPs")
abline(h=-log10(0.05/nrow(output)), lty=2, lwd=2, col=2)
Test the hypothesis \(\beta_G = \beta_{GxE} = \gamma_G = 0\)
where \(\gamma_G\) is the G vs E association in combined case/control sample (see Murcray 2009, Gauderman 2019)
pvalues <- pchisq(output[, 'lrt3df'], df = 3, lower.tail = F)
r <- gcontrol2(pvalues, pch=16)
lambda <- round(r$lambda,3)
text(x=1, y=5, labels=bquote(lambda == .(lambda)), cex=2)
neglog.pvalues <- -log10(pchisq(output[,"lrt3df"], lower.tail = F, df = 3))
plot(1:nrow(output), neglog.pvalues,
pch=16, xaxt="n", ylim=c(0, max(neglog.pvalues, 3)),
ylab="-log(p-value)", xlab="SNPs")
abline(h=-log10(0.05/nrow(output)), lty=2, lwd=2, col=2)
We can use marginal statistics (D|G main effect) to filter SNPs prior to conducting GxE testing. For example, filter SNPs based on their D|G statistic at a \(\alpha < 0.05\) level:
# calculate p-values, filter at alpha < 0.05 threshold
output$step1_pvalue <- pchisq(output$lrtdg, lower.tail = F, df = 1)
output$step2_pvalue <- pchisq(output$lrtgxe, lower.tail = F, df = 1)
output_filter <- output[which(output$step1_pvalue < 0.05), ]
# check GxE significance in smaller subset (N = 62)
output_filter$pvalue_bonferroni <- p.adjust(output_filter$step2_pvalue, method = 'bonferroni')
output_filter_significant <- output_filter[which(output_filter$pvalue_bonferroni < 0.05),]
output_filter_significant # EMPTY
Why did we choose \(\alpha < 0.05\) for subset testing? Is there an optimal choice?
Rather than choosing an \(\alpha\) cut-off, we can include all SNPs by using the step 1 statistic to weight step 2 testing. This way, we do not need to make a decision regarding the appropriate step 1 p-value cutoff, while still maintaining overall \(\alpha = 0.05\).
We use the weighted hypothesis testing framework described in Ionita-Laza et al (2007). Specifically, we partition SNPs into exponentially larger bins (with an initial bin size of 5), each with an increasingly more stringent step 2 significance threshold. Essentially, we are giving ‘preferential treatment’ to SNPs with strong step 1 statistics, and testing them at a lower interaction significance threshold.
The partitioning scheme from Ionita-Laza is as follows:
Now onto our results - we partition SNPs into bins and check for significance
# ---- create exponential bins ---- #
sizeBin0 = 5 # initial bin size = 5
m = 997 # number of SNPs
nbins = ceiling(log2(m/sizeBin0 + 1)) # 8 bins to accommodate 1000 SNPs
sizeBin = c(sizeBin0 * 2^(0:(nbins-2)), m - sizeBin0 * (2^(nbins-1) - 1) )
# ---- sort by step 1 statistic (in this case, lrtdg), assign bins ---- #
endpointsBin = cumsum(sizeBin)
bin <- ceiling(log(c(1:m)/sizeBin0+1,base=2))
alphaBin = 0.05 * 2 ^ -(1:nbins) / sizeBin
alphaBin_dat <- rep(alphaBin, table(bin))
# ---- some checks ---- #
table(bin) # bin sizes
## bin
## 1 2 3 4 5 6 7 8
## 5 10 20 40 80 160 320 362
table(alphaBin) # bin step 2 sig threshold
## alphaBin
## 5.3953729281768e-07 1.220703125e-06 4.8828125e-06 1.953125e-05
## 1 1 1 1
## 7.8125e-05 0.0003125 0.00125 0.005
## 1 1 1 1
sum(alphaBin * sizeBin) # overall alpha
## [1] 0.04980469
# ---- assign bins, check for significance ---- #
# sort by lrtdg statistic
output_twostep <- output[order(-output[,'lrtdg']), ]
output_twostep$bin_number <- bin
output_twostep$step2p_sig <- as.numeric(alphaBin_dat)
# IN THIS CASE - NO significant interactions
output_twostep_significant <- output_twostep[which(output_twostep$step2p_pvalue < output_twostep$step2p_sig),]
kable(output_twostep_significant) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
| snp | betadg | lrtdg | betagxe | lrtgxe | lrt2df | betaeg | lrteg | lrt3df | betacase | lrtcase | betactrl | lrtctrl | lrtedge | step1_pvalue | step2_pvalue | bin_number | step2p_sig |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Visualizing two-step method bins + SNPs is helpful:
# calculate log pvalues for plotting
output_twostep$log_step2p <- -log10(output_twostep$step2_pvalue)
output_twostep$log_step2p_siglvl <- -log10(output_twostep$step2p_sig)
output_twostep_plot <- split(output_twostep, f = list(output_twostep$bin_number))
# create 'base pair' for each bin so SNPs are plotting evenly
create_mapinfo <- function(x) {
mapinfo <- seq(unique(x$bin_number) - 1 + 0.1, unique(x$bin_number) - 1 + 0.9, length.out = nrow(x))
out <- cbind(x, mapinfo)
return(out)
}
output_twostep_plot <- lapply(output_twostep_plot, create_mapinfo)
# PLOT
binsToPlot = length(output_twostep_plot)
color <- rep(c("#377EB8","#4DAF4A"),100)
par(mar=c(6, 7, 6, 3))
bin_to_plot = output_twostep_plot[[1]]
plot(bin_to_plot$mapinfo, bin_to_plot$log_step2p,
col = ifelse(bin_to_plot$snp %in% output_twostep_significant[, 'snp'], '#E41A1C','#377EB8'),
pch = ifelse(bin_to_plot$snp %in% output_twostep_significant[, 'snp'], 19, 20),
cex = ifelse(bin_to_plot$snp %in% output_twostep_significant[, 'snp'], 1.3, 1.7),
xlab="Bin number for step1 p value",
ylab="-log10(step2 chiSqGxE p value)",
xlim=c(0, binsToPlot),
ylim=c(0, 12),
axes=F,
cex.main = 1.7,
cex.axis = 1.7,
cex.lab = 1.7,
cex.sub = 1.7)
lines(bin_to_plot$mapinfo, bin_to_plot$log_step2p_siglvl, col = "black", lwd=1)
# remaining bins
for(i in 2:binsToPlot) {
bin_to_plot = output_twostep_plot[[i]]
points(bin_to_plot$mapinfo, bin_to_plot$log_step2p,
col = ifelse(bin_to_plot$snp %in% output_twostep_significant$snp, '#E41A1C', color[i]),
pch = ifelse(bin_to_plot$snp %in% output_twostep_significant$snp, 19, 20),
cex = ifelse(bin_to_plot$snp %in% output_twostep_significant$snp, 1.3, 1.7),
cex.main = 1.7,
cex.axis = 1.7,
cex.lab = 1.7,
cex.sub = 1.7)
lines(bin_to_plot$mapinfo, bin_to_plot$log_step2p_sig, col = "black",lwd = 1)
}
axis(1, at = c(-1.5, seq(0.5, binsToPlot-0.2, 1)), label = c(0, seq(1, binsToPlot, 1)), cex.axis = 1.7)
axis(2, at = c(0:floor(12)), label = c(0:12), cex.axis=1.7)
Let’s look at all the results for alcohol X gene analysis now. I
wrapped all the steps above in functions for convenience, you can refer
to them in the file interaction_functions.R
plot_qq(output, 'lrtdg', 1)
plot_qq(output, 'lrtgxe', 1, print_yloc = 2.5)
plot_qq(output, 'lrtcase', 1, print_yloc = 2)
plot_qq(output, 'lrt2df', 2)
plot_qq(output, 'lrt3df', 3)
plot_manhattan(output, 'lrtdg', 1)
kable(sig_table(output, 'lrtdg', 1)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | lrtdg_pval | lrtgxe_pval | lrt2df_pval | lrteg_pval | lrtctrl_pval | lrtcase_pval | lrt3df_pval | |
|---|---|---|---|---|---|---|---|---|
| 181 | SNP.183 | 1.1459e-07 | 6.8429e-01 | 7.2446e-07 | 9.4921e-01 | 8.3911e-01 | 6.2487e-01 | 3.1727e-06 |
| 276 | SNP.278 | 8.3274e-07 | 8.1480e-01 | 5.1954e-06 | 3.2564e-01 | 4.9653e-01 | 3.0262e-01 | 1.3353e-05 |
| 295 | SNP.297 | 6.6784e-11 | 9.4069e-01 | 5.5712e-10 | 7.1133e-01 | 9.1078e-01 | 8.6978e-01 | 2.7762e-09 |
| 499 | SNP.502 | 7.6514e-06 | 3.8074e-01 | 3.0564e-05 | 4.4388e-01 | 1.8817e-01 | 4.1470e-01 | 8.7878e-05 |
| 570 | SNP.573 | 3.7539e-23 | 8.1420e-01 | 4.5809e-22 | 4.7419e-01 | 8.9811e-01 | 4.8411e-01 | 2.8401e-21 |
| 648 | SNP.651 | 1.0438e-08 | 6.2329e-01 | 6.8281e-08 | 6.9320e-01 | 6.0614e-01 | 8.1540e-01 | 2.9872e-07 |
| 939 | SNP.942 | 4.2118e-05 | 9.3578e-02 | 5.5864e-05 | 9.4428e-01 | 4.5410e-01 | 5.3258e-01 | 2.0640e-04 |
plot_manhattan(output, 'lrtgxe', 1)
kable(sig_table(output, 'lrtgxe', 1)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | lrtdg_pval | lrtgxe_pval | lrt2df_pval | lrteg_pval | lrtctrl_pval | lrtcase_pval | lrt3df_pval |
|---|---|---|---|---|---|---|---|
plot_manhattan(output, 'lrtcase', 1)
plot_manhattan(output, 'lrt2df', 2)
plot_manhattan(output, 'lrt3df', 3)
plot_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval |
|---|---|---|
plot_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval |
|---|---|---|
plot_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval |
|---|---|---|
Your turn
Follow the analysis pipeline in G x Maternal Alcohol use and
childhood BMI, perform GxE analysis using GxEScanR
to assess interactions between maternal smoking + genetic variants in
determining childhood BMI. All necessary files and functions are
provided for expediency. Below are the two questions we are trying to
answer:
library(BinaryDosage)
library(GxEScanR)
# run GxEScanR
bdinfo <- BinaryDosage::getbdinfo("data/smoking.bdose")
epidata <- read.table("data/epidata_smoking.txt", stringsAsFactors = F, header = T)
output <- GxEScanR::gweis(data = epidata, bdinfo = bdinfo)
## [1] "1122 subjects have complete data"
output$lrtedge = output$lrtdg + output$lrteg
Focus on GxE + two-step methods
# GxE
plot_manhattan(output, 'lrtgxe', 1)
kable(sig_table(output, 'lrtgxe', 1)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | lrtdg_pval | lrtgxe_pval | lrt2df_pval | lrteg_pval | lrtctrl_pval | lrtcase_pval | lrt3df_pval | |
|---|---|---|---|---|---|---|---|---|
| 897 | SNP.900 | 9.7750e-03 | 2.4526e-05 | 4.8418e-06 | 3.9287e-01 | 6.5926e-03 | 1.5440e-04 | 1.3979e-05 |
| 940 | SNP.943 | 2.7853e-02 | 1.2948e-10 | 9.5035e-11 | 4.8494e-06 | 6.0080e-01 | NA | 1.8276e-14 |
# Two-step method - filter by genetic main effect
plot_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 276 | SNP.278 | 9.5299e-07 | 2.6613e-04 |
| 499 | SNP.502 | 1.0324e-05 | 1.6348e-04 |
| 897 | SNP.900 | 9.7750e-03 | 2.4526e-05 |
| 940 | SNP.943 | 2.7853e-02 | 1.2948e-10 |
# Two-step method - filter by gene | exposure association among combined case/control population
plot_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 940 | SNP.943 | 4.8494e-06 | 1.2948e-10 |
# Two-step method - filter by EDGE statistic
plot_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 940 | SNP.943 | 2.5832e-06 | 1.2948e-10 |
| 276 | SNP.278 | 5.7617e-06 | 2.6613e-04 |
| 499 | SNP.502 | 1.8687e-05 | 1.6348e-04 |
| 897 | SNP.900 | 2.4656e-02 | 2.4526e-05 |
plot_qq(output, 'lrtdg', 1)
plot_qq(output, 'lrtgxe', 1, print_yloc = 2.5)
plot_qq(output, 'lrtcase', 1, print_yloc = 2)
plot_qq(output, 'lrt2df', 2)
plot_qq(output, 'lrt3df', 3)
plot_manhattan(output, 'lrtdg', 1)
kable(sig_table(output, 'lrtdg', 1)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | lrtdg_pval | lrtgxe_pval | lrt2df_pval | lrteg_pval | lrtctrl_pval | lrtcase_pval | lrt3df_pval | |
|---|---|---|---|---|---|---|---|---|
| 181 | SNP.183 | 1.0454e-07 | 7.8142e-03 | 2.0937e-08 | 8.4334e-01 | 2.7144e-02 | 6.0087e-02 | 1.0015e-07 |
| 276 | SNP.278 | 9.5299e-07 | 2.6613e-04 | 7.8886e-09 | 7.4277e-01 | 6.9956e-03 | 2.1574e-02 | 3.7436e-08 |
| 295 | SNP.297 | 6.9823e-11 | 6.7325e-01 | 5.3387e-10 | 7.6765e-01 | 8.7742e-01 | 9.5139e-01 | 2.7283e-09 |
| 499 | SNP.502 | 1.0324e-05 | 1.6348e-04 | 4.9055e-08 | 1.2732e-01 | 2.3466e-01 | 9.6675e-04 | 7.5414e-08 |
| 570 | SNP.573 | 4.5195e-23 | 8.7209e-01 | 5.5863e-22 | 1.9182e-01 | 6.5456e-01 | 4.4153e-01 | 1.9165e-21 |
| 648 | SNP.651 | 1.1621e-08 | 9.7097e-02 | 2.1593e-08 | 6.1608e-01 | 1.2934e-01 | 1.5185e-01 | 9.3075e-08 |
plot_manhattan(output, 'lrtgxe', 1)
kable(sig_table(output, 'lrtgxe', 1)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | lrtdg_pval | lrtgxe_pval | lrt2df_pval | lrteg_pval | lrtctrl_pval | lrtcase_pval | lrt3df_pval | |
|---|---|---|---|---|---|---|---|---|
| 897 | SNP.900 | 9.7750e-03 | 2.4526e-05 | 4.8418e-06 | 3.9287e-01 | 6.5926e-03 | 1.5440e-04 | 1.3979e-05 |
| 940 | SNP.943 | 2.7853e-02 | 1.2948e-10 | 9.5035e-11 | 4.8494e-06 | 6.0080e-01 | NA | 1.8276e-14 |
plot_manhattan(output, 'lrtcase', 1)
plot_manhattan(output, 'lrt2df', 2)
plot_manhattan(output, 'lrt3df', 3)
plot_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtdg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 276 | SNP.278 | 9.5299e-07 | 2.6613e-04 |
| 499 | SNP.502 | 1.0324e-05 | 1.6348e-04 |
| 897 | SNP.900 | 9.7750e-03 | 2.4526e-05 |
| 940 | SNP.943 | 2.7853e-02 | 1.2948e-10 |
plot_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrteg', step1_df = 1, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 940 | SNP.943 | 4.8494e-06 | 1.2948e-10 |
plot_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)
kable(sig_twostep(output, step1_statistic = 'lrtedge', step1_df = 2, sizeBin0 = 5, alpha = 0.05)) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = 'left')
| snp | step1_pval | step2_pval | |
|---|---|---|---|
| 940 | SNP.943 | 2.5832e-06 | 1.2948e-10 |
| 276 | SNP.278 | 5.7617e-06 | 2.6613e-04 |
| 499 | SNP.502 | 1.8687e-05 | 1.6348e-04 |
| 897 | SNP.900 | 2.4656e-02 | 2.4526e-05 |
est <- readRDS("~/Desktop/snp.943_or.rds")
options(knitr.kable.NA = '')
kable(est) %>%
kable_styling('bordered', bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position = 'left') %>%
add_header_above(c(" " = 4, "G param by E" = 2, "Counts (Ca/Co)" = 3)) %>%
pack_rows("E param by G", 5, 6, indent = F)